In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf

import plotly
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

#print(__version__) # requires version >= 1.9.0

# For Notebooks
init_notebook_mode(connected=True)

# For offline use
cf.go_offline()

%matplotlib inline
In [2]:
school_frame = pd.read_csv('2016 School Explorer.csv')
shsat_frame = pd.read_csv("D5 SHSAT Registrations and Testers.csv")
In [3]:
pd.set_option('display.max_columns', None)  
school_frame.head()
Out[3]:
Adjusted Grade New? Other Location Code in LCGMS School Name SED Code Location Code District Latitude Longitude Address (Full) City Zip Grades Grade Low Grade High Community School? Economic Need Index School Income Estimate Percent ELL Percent Asian Percent Black Percent Hispanic Percent Black / Hispanic Percent White Student Attendance Rate Percent of Students Chronically Absent Rigorous Instruction % Rigorous Instruction Rating Collaborative Teachers % Collaborative Teachers Rating Supportive Environment % Supportive Environment Rating Effective School Leadership % Effective School Leadership Rating Strong Family-Community Ties % Strong Family-Community Ties Rating Trust % Trust Rating Student Achievement Rating Average ELA Proficiency Average Math Proficiency Grade 3 ELA - All Students Tested Grade 3 ELA 4s - All Students Grade 3 ELA 4s - American Indian or Alaska Native Grade 3 ELA 4s - Black or African American Grade 3 ELA 4s - Hispanic or Latino Grade 3 ELA 4s - Asian or Pacific Islander Grade 3 ELA 4s - White Grade 3 ELA 4s - Multiracial Grade 3 ELA 4s - Limited English Proficient Grade 3 ELA 4s - Economically Disadvantaged Grade 3 Math - All Students tested Grade 3 Math 4s - All Students Grade 3 Math 4s - American Indian or Alaska Native Grade 3 Math 4s - Black or African American Grade 3 Math 4s - Hispanic or Latino Grade 3 Math 4s - Asian or Pacific Islander Grade 3 Math 4s - White Grade 3 Math 4s - Multiracial Grade 3 Math 4s - Limited English Proficient Grade 3 Math 4s - Economically Disadvantaged Grade 4 ELA - All Students Tested Grade 4 ELA 4s - All Students Grade 4 ELA 4s - American Indian or Alaska Native Grade 4 ELA 4s - Black or African American Grade 4 ELA 4s - Hispanic or Latino Grade 4 ELA 4s - Asian or Pacific Islander Grade 4 ELA 4s - White Grade 4 ELA 4s - Multiracial Grade 4 ELA 4s - Limited English Proficient Grade 4 ELA 4s - Economically Disadvantaged Grade 4 Math - All Students Tested Grade 4 Math 4s - All Students Grade 4 Math 4s - American Indian or Alaska Native Grade 4 Math 4s - Black or African American Grade 4 Math 4s - Hispanic or Latino Grade 4 Math 4s - Asian or Pacific Islander Grade 4 Math 4s - White Grade 4 Math 4s - Multiracial Grade 4 Math 4s - Limited English Proficient Grade 4 Math 4s - Economically Disadvantaged Grade 5 ELA - All Students Tested Grade 5 ELA 4s - All Students Grade 5 ELA 4s - American Indian or Alaska Native Grade 5 ELA 4s - Black or African American Grade 5 ELA 4s - Hispanic or Latino Grade 5 ELA 4s - Asian or Pacific Islander Grade 5 ELA 4s - White Grade 5 ELA 4s - Multiracial Grade 5 ELA 4s - Limited English Proficient Grade 5 ELA 4s - Economically Disadvantaged Grade 5 Math - All Students Tested Grade 5 Math 4s - All Students Grade 5 Math 4s - American Indian or Alaska Native Grade 5 Math 4s - Black or African American Grade 5 Math 4s - Hispanic or Latino Grade 5 Math 4s - Asian or Pacific Islander Grade 5 Math 4s - White Grade 5 Math 4s - Multiracial Grade 5 Math 4s - Limited English Proficient Grade 5 Math 4s - Economically Disadvantaged Grade 6 ELA - All Students Tested Grade 6 ELA 4s - All Students Grade 6 ELA 4s - American Indian or Alaska Native Grade 6 ELA 4s - Black or African American Grade 6 ELA 4s - Hispanic or Latino Grade 6 ELA 4s - Asian or Pacific Islander Grade 6 ELA 4s - White Grade 6 ELA 4s - Multiracial Grade 6 ELA 4s - Limited English Proficient Grade 6 ELA 4s - Economically Disadvantaged Grade 6 Math - All Students Tested Grade 6 Math 4s - All Students Grade 6 Math 4s - American Indian or Alaska Native Grade 6 Math 4s - Black or African American Grade 6 Math 4s - Hispanic or Latino Grade 6 Math 4s - Asian or Pacific Islander Grade 6 Math 4s - White Grade 6 Math 4s - Multiracial Grade 6 Math 4s - Limited English Proficient Grade 6 Math 4s - Economically Disadvantaged Grade 7 ELA - All Students Tested Grade 7 ELA 4s - All Students Grade 7 ELA 4s - American Indian or Alaska Native Grade 7 ELA 4s - Black or African American Grade 7 ELA 4s - Hispanic or Latino Grade 7 ELA 4s - Asian or Pacific Islander Grade 7 ELA 4s - White Grade 7 ELA 4s - Multiracial Grade 7 ELA 4s - Limited English Proficient Grade 7 ELA 4s - Economically Disadvantaged Grade 7 Math - All Students Tested Grade 7 Math 4s - All Students Grade 7 Math 4s - American Indian or Alaska Native Grade 7 Math 4s - Black or African American Grade 7 Math 4s - Hispanic or Latino Grade 7 Math 4s - Asian or Pacific Islander Grade 7 Math 4s - White Grade 7 Math 4s - Multiracial Grade 7 Math 4s - Limited English Proficient Grade 7 Math 4s - Economically Disadvantaged Grade 8 ELA - All Students Tested Grade 8 ELA 4s - All Students Grade 8 ELA 4s - American Indian or Alaska Native Grade 8 ELA 4s - Black or African American Grade 8 ELA 4s - Hispanic or Latino Grade 8 ELA 4s - Asian or Pacific Islander Grade 8 ELA 4s - White Grade 8 ELA 4s - Multiracial Grade 8 ELA 4s - Limited English Proficient Grade 8 ELA 4s - Economically Disadvantaged Grade 8 Math - All Students Tested Grade 8 Math 4s - All Students Grade 8 Math 4s - American Indian or Alaska Native Grade 8 Math 4s - Black or African American Grade 8 Math 4s - Hispanic or Latino Grade 8 Math 4s - Asian or Pacific Islander Grade 8 Math 4s - White Grade 8 Math 4s - Multiracial Grade 8 Math 4s - Limited English Proficient Grade 8 Math 4s - Economically Disadvantaged
0 NaN NaN NaN P.S. 015 ROBERTO CLEMENTE 310100010015 01M015 1 40.721834 -73.978766 333 E 4TH ST NEW YORK, NY 10009 NEW YORK 10009 PK,0K,01,02,03,04,05 PK 05 Yes 0.919 $31,141.72 9% 5% 32% 60% 92% 1% 94% 18% 89% Meeting Target 94% Meeting Target 86% Exceeding Target 91% Exceeding Target 85% Meeting Target 94% Exceeding Target Approaching Target 2.14 2.17 20 0 0 0 0 0 0 0 0 0 21 0 0 0 0 0 0 0 0 0 15 0 0 0 0 0 0 0 0 0 15 2 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 NaN NaN NaN P.S. 019 ASHER LEVY 310100010019 01M019 1 40.729892 -73.984231 185 1ST AVE NEW YORK, NY 10003 NEW YORK 10003 PK,0K,01,02,03,04,05 PK 05 No 0.641 $56,462.88 5% 10% 20% 63% 83% 6% 92% 30% 96% NaN 96% NaN 97% NaN 90% Exceeding Target 86% Meeting Target 94% Meeting Target Exceeding Target 2.63 2.98 33 2 0 1 1 0 0 0 0 0 33 6 0 2 1 0 0 0 0 4 29 5 0 0 3 0 0 0 0 3 28 10 0 0 6 0 0 0 0 8 32 7 0 3 1 2 0 0 0 6 32 4 0 0 1 2 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 NaN NaN NaN P.S. 020 ANNA SILVER 310100010020 01M020 1 40.721274 -73.986315 166 ESSEX ST NEW YORK, NY 10002 NEW YORK 10002 PK,0K,01,02,03,04,05 PK 05 No 0.744 $44,342.61 15% 35% 8% 49% 57% 4% 94% 20% 87% Meeting Target 77% Meeting Target 82% Approaching Target 61% Not Meeting Target 80% Approaching Target 79% Not Meeting Target Approaching Target 2.39 2.54 76 6 0 0 0 4 0 0 0 2 76 11 0 0 3 7 0 0 0 6 70 9 0 0 1 6 2 0 0 1 71 13 0 0 0 11 2 0 0 4 73 2 0 0 1 1 0 0 0 0 73 10 0 0 1 9 0 0 1 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 NaN NaN NaN P.S. 034 FRANKLIN D. ROOSEVELT 310100010034 01M034 1 40.726147 -73.975043 730 E 12TH ST NEW YORK, NY 10009 NEW YORK 10009 PK,0K,01,02,03,04,05,06,07,08 PK 08 No 0.860 $31,454.00 7% 5% 29% 63% 92% 4% 92% 28% 85% Approaching Target 78% Meeting Target 82% Meeting Target 73% Approaching Target 89% Meeting Target 88% Meeting Target Exceeding Target 2.48 2.47 27 0 0 0 0 0 0 0 0 0 29 4 0 0 2 0 0 0 0 0 35 1 0 0 1 0 0 0 0 0 34 1 0 0 1 0 0 0 0 0 29 0 0 0 0 0 0 0 0 0 29 1 0 0 1 0 0 0 0 0 54 3 0 0 1 0 0 0 0 3 54 3 0 0 0 0 0 0 0 3 55 4 0 0 3 0 0 0 0 0 55 3 0 0 3 0 0 0 0 0 47 1 0 0 0 0 0 0 0 0 48 1 0 0 0 0 0 0 0 0
4 NaN NaN NaN THE STAR ACADEMY - P.S.63 310100010063 01M063 1 40.724404 -73.986360 121 E 3RD ST NEW YORK, NY 10009 NEW YORK 10009 PK,0K,01,02,03,04,05 PK 05 No 0.730 $46,435.59 3% 4% 20% 65% 84% 10% 93% 23% 90% Meeting Target 88% Meeting Target 87% Meeting Target 81% Meeting Target 89% Meeting Target 93% Meeting Target Meeting Target 2.38 2.54 21 2 0 0 2 0 0 0 0 0 21 5 0 0 2 0 0 0 0 2 15 2 0 1 0 0 0 0 0 0 15 3 0 1 0 0 0 0 0 0 12 1 0 0 0 0 0 0 0 1 12 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
In [4]:
school_frame.shape
Out[4]:
(1272, 161)
In [5]:
school_frame.columns
Out[5]:
Index(['Adjusted Grade', 'New?', 'Other Location Code in LCGMS', 'School Name',
       'SED Code', 'Location Code', 'District', 'Latitude', 'Longitude',
       'Address (Full)',
       ...
       'Grade 8 Math - All Students Tested', 'Grade 8 Math 4s - All Students',
       'Grade 8 Math 4s - American Indian or Alaska Native',
       'Grade 8 Math 4s - Black or African American',
       'Grade 8 Math 4s - Hispanic or Latino',
       'Grade 8 Math 4s - Asian or Pacific Islander',
       'Grade 8 Math 4s - White', 'Grade 8 Math 4s - Multiracial',
       'Grade 8 Math 4s - Limited English Proficient',
       'Grade 8 Math 4s - Economically Disadvantaged'],
      dtype='object', length=161)
In [6]:
# Function converting string % values to int
def percent_to_int(df_in):
    for col in df_in.columns.values:
        if col.startswith("Percent") or col.endswith("%") or col.endswith("Rate"):
            df_in[col] = df_in[col].astype(np.object).str.replace('%', '').astype(float)
    return df_in
In [7]:
school_frame = percent_to_int(school_frame)
In [8]:
#Filtering out non Middle Schools
school_frame_raw = school_frame[school_frame['Grade High'] != '0K'] 
#school_frame_relevant_grade = school_frame[school_frame['Grade High'].astype(int) > 5]

#Cleaning School Income Estimate
school_frame_raw['School Income Estimate'] = school_frame['School Income Estimate'].astype(np.object).str.replace('$', '').str.replace(',', '').str.replace('.', '').astype(float)



#Removing >95% null columns
high_nan_columns = school_frame_raw.columns[school_frame_raw.isnull().mean()>0.95]
#school_frame = school_frame_relevant_grade.drop(high_nan_columns, axis=1)
#print(list(high_bnan_columns))

#school_frame_raw contains all grades while school_frame contains SHSAT eligible middle schools
school_frame_relevant_grades = school_frame_raw[school_frame_raw['Grade High'].astype(int) > 5]
school_frame = school_frame_relevant_grades.drop(high_nan_columns, axis=1)

#Filling null values which prevent calculations
school_frame['School Income Estimate'] = school_frame['School Income Estimate'].fillna(0)
school_frame['Economic Need Index'] = school_frame['Economic Need Index'].fillna(0)

print("We have %d relevant schools and %d fields describing the school/ students"%(school_frame.shape))
We have 626 relevant schools and 158 fields describing the school/ students
C:\Users\Omar\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [9]:
#Distributions of schools based on city
schools_by_city = school_frame['City'].value_counts()
schools_by_city.iplot(kind='bar',yTitle='Number of SHSAT Eligible Schools', title='SHSAT Eligible Schools by City', filename='cufflinks/categorical-bar-chart')
In [10]:
#Analysis of Chronically Absent Students

school_frame = school_frame.dropna(subset = ['Percent of Students Chronically Absent'])
temp = sns.distplot(school_frame[['Percent of Students Chronically Absent']].values, kde=False)
temp = plt.title('Distribution of Schools Based on Chronically Absent Students')
temp = plt.xlabel("Percent of students")
temp = plt.ylabel("Count")
In [11]:
chronically_absent_foi = ['School Name', 'Percent of Students Chronically Absent','Percent Black / Hispanic','Economic Need Index']

df_schools_ca = school_frame[school_frame['Percent of Students Chronically Absent'] > 40]
df_schools_nca = school_frame[school_frame['Percent of Students Chronically Absent'] < 5]

df_schools_ca[chronically_absent_foi].sort_values('Percent of Students Chronically Absent', ascending = False)
Out[11]:
School Name Percent of Students Chronically Absent Percent Black / Hispanic Economic Need Index
1261 SUCCESS ACADEMY CHARTER SCHOOL - BRONX 2 100.0 98.0 0.794
1260 SUCCESS ACADEMY CHARTER SCHOOL - BRONX 1 100.0 96.0 0.825
1206 SUCCESS ACADEMY CHARTER SCHOOL - UPPER WEST 100.0 48.0 0.369
1203 SUCCESS ACADEMY CHARTER SCHOOL - HARLEM 5 100.0 95.0 0.789
1198 SUCCESS ACADEMY CHARTER SCHOOL - HARLEM 4 100.0 94.0 0.711
1197 SUCCESS ACADEMY CHARTER SCHOOL - HARLEM 3 100.0 94.0 0.720
1196 SUCCESS ACADEMY CHARTER SCHOOL - HARLEM 2 100.0 97.0 0.734
1173 BROOKLYN LAB CHARTER SCHOOL 100.0 89.0 0.639
1121 SUCCESS ACADEMY CHARTER SCHOOL - BED-STUY 1 100.0 96.0 0.750
300 NEW DIRECTIONS SECONDARY SCHOOL 74.0 98.0 0.920
12 ORCHARD COLLEGIATE ACADEMY 58.0 84.0 0.832
631 P.S. 306 ETHAN ALLEN 57.0 98.0 0.875
527 DIGITAL ARTS AND CINEMA TECHNOLOGY HIGH SCHOOL 52.0 93.0 0.749
755 P.S. 165 IDA POSNER 50.0 97.0 0.815
110 P.S. 050 VITO MARCANTONIO 49.0 98.0 0.890
135 P.S. 123 MAHALIA JACKSON 49.0 93.0 0.835
874 P.S./M.S 042 R. VERNAM 48.0 95.0 0.825
757 P.S. 184 NEWPORT 48.0 98.0 0.904
432 EAST BRONX ACADEMY FOR THE FUTURE 48.0 98.0 0.853
244 M.S. 302 LUISA DESSUS CRUZ 48.0 98.0 0.906
139 EAGLE ACADEMY FOR YOUNG MEN OF HARLEM 47.0 98.0 0.747
761 P.S. 327 DR. ROSE B. ENGLISH 47.0 96.0 0.851
469 SATELLITE WEST MIDDLE SCHOOL 47.0 94.0 0.859
466 SATELLITE EAST MIDDLE SCHOOL 47.0 78.0 0.856
548 M.S. 267 MATH, SCIENCE & TECHNOLOGY 46.0 87.0 0.826
714 P.S. 288 THE SHIRLEY TANYHILL 46.0 90.0 0.912
402 LEADERS OF TOMORROW 46.0 90.0 0.771
756 P.S. 178 SAINT CLAIR MCKELWAY 46.0 95.0 0.884
243 M.S. 301 PAUL L. DUNBAR 45.0 98.0 0.823
1008 P.S. 111 JACOB BLACKWELL 45.0 89.0 0.847
294 BRONX WRITING ACADEMY 45.0 95.0 0.892
549 P.S. 308 CLARA CARDWELL 44.0 96.0 0.828
257 P.S./M.S. 004 CROTONA PARK WEST 44.0 98.0 0.903
767 MOTT HALL IV 44.0 98.0 0.771
900 P.S. 183 DR. RICHARD R. GREEN 43.0 87.0 0.839
277 J.H.S. 145 ARTURO TOSCANINI 43.0 98.0 0.888
185 M.S. 328 - MANHATTAN MIDDLE SCHOOL FOR SCIENTI... 43.0 97.0 0.834
753 P.S./ I.S. 155 NICHOLAS HERKIMER 43.0 96.0 0.877
751 P.S./I.S. 137 RACHEL JEAN MITCHELL 43.0 94.0 0.807
649 HIGHLAND PARK COMMUNITY SCHOOL 43.0 96.0 0.818
265 P.S. 058 43.0 97.0 0.910
762 BROWNSVILLE COLLABORATIVE MIDDLE SCHOOL 42.0 96.0 0.925
293 I.S. 313 SCHOOL OF LEADERSHIP DEVELOPMENT 42.0 99.0 0.881
750 P.S. 041 FRANCIS WHITE 42.0 95.0 0.900
77 P.S. 076 A. PHILIP RANDOLPH 42.0 96.0 0.850
146 URBAN ASSEMBLY ACADEMY FOR FUTURE LEADERS 42.0 98.0 0.817
440 URBAN ASSEMBLY SCHOOL FOR WILDLIFE CONSERVATION 42.0 96.0 0.809
552 M.S. 584 41.0 97.0 0.885
626 J.H.S. 218 JAMES P. SINNOTT 41.0 79.0 0.857
In [12]:
plt.figure(figsize = [16,7])
plt.suptitle('Statistics for Schools with High Chronically Absent Percentage', fontsize=15)
plt.subplot(1,2,1)
temp = sns.distplot(df_schools_ca[['Economic Need Index']].values, kde=False)
temp = plt.xlabel("Economic Need Index", fontsize=15)
temp = plt.ylabel("School count", fontsize=15)
plt.subplot(1,2,2)
temp = sns.distplot(df_schools_ca[['Percent Black / Hispanic']].values, kde=False)
temp = plt.xlabel("Percent Black / Hispanic", fontsize=15)
temp = plt.ylabel("School count", fontsize=15)
In [13]:
data = [
    {
        'x': school_frame["Longitude"],
        'y': school_frame["Latitude"],
        'text': school_frame["School Name"],
        'mode': 'markers',
        'marker': {
            'color': school_frame["Economic Need Index"],
            'size': school_frame["School Income Estimate"]/4500,
            'showscale': True,
            'colorscale':'Portland'
        }
    }
]

layout= go.Layout(
    title= 'New York School Population (Economic Need Index)',
    xaxis= dict(
        title= 'Longitude'
    ),
    yaxis=dict(
        title='Latitude'
    ))
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels.html')
In [14]:
#Black Population
data = [
    {
        'x': school_frame["Longitude"],
        'y': school_frame["Latitude"],
        'text': school_frame["School Name"],
        'mode': 'markers',
        'marker': {
            'color': school_frame["Percent Black"],
            'size': school_frame["School Income Estimate"]/4500,
            'showscale': True,
            'colorscale':'Portland'
        }
    }
]

layout= go.Layout(
    title= 'New York Black Student Ratio Of School',
    xaxis= dict(
        title= 'Longitude'
    ),
    yaxis=dict(
        title='Latitude'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels')
In [15]:
#Hispanic Population
data = [
    {
        'x': school_frame["Longitude"],
        'y': school_frame["Latitude"],
        'text': school_frame["School Name"],
        'mode': 'markers',
        'marker': {
            'color': school_frame["Percent Hispanic"],
            'size': school_frame["School Income Estimate"]/4500,
            'showscale': True,
            'colorscale':'Portland'
        }
    }
]

layout= go.Layout(
    title= 'New York Hispanic Student Ratio Of School',
    xaxis= dict(
        title= 'Longitude'
    ),
    yaxis=dict(
        title='Latitude'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels')
In [16]:
#combination of black and latino
#bl_frame = school_frame["Percent Hispanic"] + school_frame["Percent Black"] 

#Hispanic Population
data = [
    {
        'x': school_frame["Longitude"],
        'y': school_frame["Latitude"],
        'text': school_frame["School Name"],
        'mode': 'markers',
        'marker': {
            'color': school_frame["Percent Black / Hispanic"],
            'size': school_frame["School Income Estimate"]/4500,
            'showscale': True,
            'colorscale':'Portland'
        }
    }
]

layout= go.Layout(
    title= 'New York Black/Hispanic Student Ratio Of School',
    xaxis= dict(
        title= 'Longitude'
    ),
    yaxis=dict(
        title='Latitude'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels')
In [17]:
lower_bound = school_frame['Economic Need Index'].quantile(0.75)
high_eni_schools = school_frame[school_frame['Economic Need Index'] > lower_bound]
low_eni_schools = school_frame[school_frame['Economic Need Index'] < 0.25]
In [18]:
high_eni_schools[['School Name', 'Economic Need Index', 'Percent Black / Hispanic']].sort_values('Economic Need Index', ascending = False).head(10)
Out[18]:
School Name Economic Need Index Percent Black / Hispanic
298 NEW MILLENNIUM BUSINESS ACADEMY MIDDLE SCHOOL 0.938 98.0
283 I.S. 219 NEW VENTURE SCHOOL 0.929 98.0
1247 MOTT HAVEN ACADEMY CHARTER SCHOOL 0.928 98.0
762 BROWNSVILLE COLLABORATIVE MIDDLE SCHOOL 0.925 96.0
300 NEW DIRECTIONS SECONDARY SCHOOL 0.920 98.0
346 P.S. 279 CAPTAIN MANUEL RIVERA, JR. 0.918 96.0
11 P.S. 188 THE ISLAND SCHOOL 0.917 93.0
299 I.S. 339 0.913 98.0
714 P.S. 288 THE SHIRLEY TANYHILL 0.912 90.0
363 M.S. 390 0.911 98.0
In [19]:
print ("Average ELA / Math performance for the high ENI schools")
high_eni_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()
Average ELA / Math performance for the high ENI schools
Out[19]:
Average ELA Proficiency     2.247792
Average Math Proficiency    2.261688
dtype: float64
In [20]:
print ("Average ELA / Math performance for the low ENI schools")
low_eni_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()
Average ELA / Math performance for the low ENI schools
Out[20]:
Average ELA Proficiency     3.267917
Average Math Proficiency    3.422083
dtype: float64
In [21]:
#After school programs
#Start by looking at which schools have the lowest income estimates. 
income_estimate = school_frame[['School Name','School Income Estimate']].sort_values(by='School Income Estimate',ascending=True)

#There are several null columns here.. this makes it difficult to use this category as a method of predicting need

null_columns=income_estimate.columns[income_estimate.isnull().any()]
income_estimate[null_columns].isnull().sum()
Out[21]:
Series([], dtype: float64)
In [22]:
#Lets see how many null columns in the entire school frame
#There is a considerable amount missing (31%) in the School Income Estimate, leading me to want to avoid using it
null_columns=school_frame.columns[school_frame.isnull().any()]
school_frame[null_columns].isnull().sum()
Out[22]:
Rigorous Instruction Rating            24
Collaborative Teachers Rating          24
Supportive Environment Rating          28
Effective School Leadership Rating     22
Strong Family-Community Ties Rating    23
Trust Rating                           23
Student Achievement Rating             22
Average ELA Proficiency                 7
Average Math Proficiency                7
dtype: int64
In [23]:
#Lets see how many null columns in the entire school frame
#Conclusion: Doesn't seem to be any
null_columns=shsat_frame.columns[shsat_frame.isnull().any()]
shsat_frame[null_columns].isnull().sum()
Out[23]:
Series([], dtype: float64)
In [24]:
column_list = school_frame.columns.values

for i,a in enumerate(column_list):
   print('%s is %d' % (a, i))
School Name is 0
SED Code is 1
Location Code is 2
District is 3
Latitude is 4
Longitude is 5
Address (Full) is 6
City is 7
Zip is 8
Grades is 9
Grade Low is 10
Grade High is 11
Community School? is 12
Economic Need Index is 13
School Income Estimate is 14
Percent ELL is 15
Percent Asian is 16
Percent Black is 17
Percent Hispanic is 18
Percent Black / Hispanic is 19
Percent White is 20
Student Attendance Rate is 21
Percent of Students Chronically Absent is 22
Rigorous Instruction % is 23
Rigorous Instruction Rating is 24
Collaborative Teachers % is 25
Collaborative Teachers Rating is 26
Supportive Environment % is 27
Supportive Environment Rating is 28
Effective School Leadership % is 29
Effective School Leadership Rating is 30
Strong Family-Community Ties % is 31
Strong Family-Community Ties Rating is 32
Trust % is 33
Trust Rating is 34
Student Achievement Rating is 35
Average ELA Proficiency is 36
Average Math Proficiency is 37
Grade 3 ELA - All Students Tested is 38
Grade 3 ELA 4s - All Students is 39
Grade 3 ELA 4s - American Indian or Alaska Native is 40
Grade 3 ELA 4s - Black or African American is 41
Grade 3 ELA 4s - Hispanic or Latino is 42
Grade 3 ELA 4s - Asian or Pacific Islander is 43
Grade 3 ELA 4s - White is 44
Grade 3 ELA 4s - Multiracial is 45
Grade 3 ELA 4s - Limited English Proficient is 46
Grade 3 ELA 4s - Economically Disadvantaged is 47
Grade 3 Math - All Students tested is 48
Grade 3 Math 4s - All Students is 49
Grade 3 Math 4s - American Indian or Alaska Native is 50
Grade 3 Math 4s - Black or African American is 51
Grade 3 Math 4s - Hispanic or Latino is 52
Grade 3 Math 4s - Asian or Pacific Islander is 53
Grade 3 Math 4s - White is 54
Grade 3 Math 4s - Multiracial is 55
Grade 3 Math 4s - Limited English Proficient is 56
Grade 3 Math 4s - Economically Disadvantaged is 57
Grade 4 ELA - All Students Tested is 58
Grade 4 ELA 4s - All Students is 59
Grade 4 ELA 4s - American Indian or Alaska Native is 60
Grade 4 ELA 4s - Black or African American is 61
Grade 4 ELA 4s - Hispanic or Latino is 62
Grade 4 ELA 4s - Asian or Pacific Islander is 63
Grade 4 ELA 4s - White is 64
Grade 4 ELA 4s - Multiracial is 65
Grade 4 ELA 4s - Limited English Proficient is 66
Grade 4 ELA 4s - Economically Disadvantaged is 67
Grade 4 Math - All Students Tested is 68
Grade 4 Math 4s - All Students is 69
Grade 4 Math 4s - American Indian or Alaska Native is 70
Grade 4 Math 4s - Black or African American is 71
Grade 4 Math 4s - Hispanic or Latino is 72
Grade 4 Math 4s - Asian or Pacific Islander is 73
Grade 4 Math 4s - White is 74
Grade 4 Math 4s - Multiracial is 75
Grade 4 Math 4s - Limited English Proficient is 76
Grade 4 Math 4s - Economically Disadvantaged is 77
Grade 5 ELA - All Students Tested is 78
Grade 5 ELA 4s - All Students is 79
Grade 5 ELA 4s - American Indian or Alaska Native is 80
Grade 5 ELA 4s - Black or African American is 81
Grade 5 ELA 4s - Hispanic or Latino is 82
Grade 5 ELA 4s - Asian or Pacific Islander is 83
Grade 5 ELA 4s - White is 84
Grade 5 ELA 4s - Multiracial is 85
Grade 5 ELA 4s - Limited English Proficient is 86
Grade 5 ELA 4s - Economically Disadvantaged is 87
Grade 5 Math - All Students Tested is 88
Grade 5 Math 4s - All Students is 89
Grade 5 Math 4s - American Indian or Alaska Native is 90
Grade 5 Math 4s - Black or African American is 91
Grade 5 Math 4s - Hispanic or Latino is 92
Grade 5 Math 4s - Asian or Pacific Islander is 93
Grade 5 Math 4s - White is 94
Grade 5 Math 4s - Multiracial is 95
Grade 5 Math 4s - Limited English Proficient is 96
Grade 5 Math 4s - Economically Disadvantaged is 97
Grade 6 ELA - All Students Tested is 98
Grade 6 ELA 4s - All Students is 99
Grade 6 ELA 4s - American Indian or Alaska Native is 100
Grade 6 ELA 4s - Black or African American is 101
Grade 6 ELA 4s - Hispanic or Latino is 102
Grade 6 ELA 4s - Asian or Pacific Islander is 103
Grade 6 ELA 4s - White is 104
Grade 6 ELA 4s - Multiracial is 105
Grade 6 ELA 4s - Limited English Proficient is 106
Grade 6 ELA 4s - Economically Disadvantaged is 107
Grade 6 Math - All Students Tested is 108
Grade 6 Math 4s - All Students is 109
Grade 6 Math 4s - American Indian or Alaska Native is 110
Grade 6 Math 4s - Black or African American is 111
Grade 6 Math 4s - Hispanic or Latino is 112
Grade 6 Math 4s - Asian or Pacific Islander is 113
Grade 6 Math 4s - White is 114
Grade 6 Math 4s - Multiracial is 115
Grade 6 Math 4s - Limited English Proficient is 116
Grade 6 Math 4s - Economically Disadvantaged is 117
Grade 7 ELA - All Students Tested is 118
Grade 7 ELA 4s - All Students is 119
Grade 7 ELA 4s - American Indian or Alaska Native is 120
Grade 7 ELA 4s - Black or African American is 121
Grade 7 ELA 4s - Hispanic or Latino is 122
Grade 7 ELA 4s - Asian or Pacific Islander is 123
Grade 7 ELA 4s - White is 124
Grade 7 ELA 4s - Multiracial is 125
Grade 7 ELA 4s - Limited English Proficient is 126
Grade 7 ELA 4s - Economically Disadvantaged is 127
Grade 7 Math - All Students Tested is 128
Grade 7 Math 4s - All Students is 129
Grade 7 Math 4s - American Indian or Alaska Native is 130
Grade 7 Math 4s - Black or African American is 131
Grade 7 Math 4s - Hispanic or Latino is 132
Grade 7 Math 4s - Asian or Pacific Islander is 133
Grade 7 Math 4s - White is 134
Grade 7 Math 4s - Multiracial is 135
Grade 7 Math 4s - Limited English Proficient is 136
Grade 7 Math 4s - Economically Disadvantaged is 137
Grade 8 ELA - All Students Tested is 138
Grade 8 ELA 4s - All Students is 139
Grade 8 ELA 4s - American Indian or Alaska Native is 140
Grade 8 ELA 4s - Black or African American is 141
Grade 8 ELA 4s - Hispanic or Latino is 142
Grade 8 ELA 4s - Asian or Pacific Islander is 143
Grade 8 ELA 4s - White is 144
Grade 8 ELA 4s - Multiracial is 145
Grade 8 ELA 4s - Limited English Proficient is 146
Grade 8 ELA 4s - Economically Disadvantaged is 147
Grade 8 Math - All Students Tested is 148
Grade 8 Math 4s - All Students is 149
Grade 8 Math 4s - American Indian or Alaska Native is 150
Grade 8 Math 4s - Black or African American is 151
Grade 8 Math 4s - Hispanic or Latino is 152
Grade 8 Math 4s - Asian or Pacific Islander is 153
Grade 8 Math 4s - White is 154
Grade 8 Math 4s - Multiracial is 155
Grade 8 Math 4s - Limited English Proficient is 156
Grade 8 Math 4s - Economically Disadvantaged is 157
In [25]:
#Correlation matrix of school frame
school_corr = school_frame.iloc[:,[13,15,16,17,18,19,20,21,22,36,37]]

corr = school_corr.corr()
corr = (corr)

fig, ax = plt.subplots()
# the size of A4 paper
fig.set_size_inches(11.7, 8.27)
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="RdYlGn", center=0, annot=True)

#plt.savefig('img/correlation.png',bbox_inches='tight')
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x224669eb940>
In [26]:
#Correlation matrix of school frame
school_corr = school_frame.iloc[:,[13,14,15,16,17,18,20,21,22]]

corr = school_corr.corr()
corr = (corr)

fig, ax = plt.subplots()
# the size of A4 paper
fig.set_size_inches(11.7, 8.27)
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="RdYlGn", center=0, annot=True)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x22467279780>
In [27]:
#Test Prep and Tutoring

minority_frame = school_frame_raw[school_frame_raw['Percent Black / Hispanic'].astype(int)>70]
print(minority_frame.shape)
(829, 161)
In [28]:
def grade_minority_percent_4s (df, grade, subject):
    out_field = ('Grade %d %s Minority 4s')%(grade, subject)
    num1 = ('Grade %d %s 4s - Black or African American')%(grade, subject)
    num2 = ('Grade %d %s 4s - Hispanic or Latino')%(grade, subject)
    den = ('Grade %d %s 4s - All Students')%(grade, subject)
    df = df[df[den].astype(int)>5]
    df[out_field] = (df[num1] + df[num2])/(df[den])
    grade_minority_df = df[df[out_field]>0.7]
    print(grade, subject, df.shape, grade_minority_df.shape, (grade_minority_df[num1] + grade_minority_df[num2]).mean(), df[num1].mean(), df[num2].mean())
    return df
In [29]:
import itertools

grades = [5,6,7]
subjects = ['ELA', 'Math']
i=1
plt.figure(figsize=[19,10])
df_all_schools = pd.DataFrame(columns=minority_frame.columns)
for grade,subject in itertools.product(grades, subjects):
    df_schools_tmp = grade_minority_percent_4s(minority_frame, grade, subject)
    
#     df_schools_tmp[]
    fname = ('Grade %d %s Minority 4s')%(grade, subject)
    plt.subplot(3,2,i)
    df_schools_tmp[fname].hist()
    df_schools_tmp.drop([fname], axis=1, inplace=True)
    plt.title(fname)
    df_all_schools = pd.concat([df_all_schools, df_schools_tmp])
    i += 1
    
plt.savefig('img/minority4s.png',bbox_inches='tight')
5 ELA (133, 162) (80, 162) 9.5625 4.0 3.8345864661654137
C:\Users\Omar\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

C:\Users\Omar\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

5 Math (175, 162) (107, 162) 10.897196261682243 4.377142857142857 5.428571428571429
6 ELA (143, 162) (93, 162) 12.698924731182796 6.2027972027972025 4.671328671328672
6 Math (185, 162) (122, 162) 17.639344262295083 7.383783783783784 7.335135135135135
7 ELA (104, 162) (61, 162) 11.60655737704918 4.0576923076923075 5.288461538461538
7 Math (115, 162) (70, 162) 15.142857142857142 5.426086956521739 6.7043478260869565
In [30]:
#SHSAT Analysis
shsat_frame.head()
Out[30]:
DBN School name Year of SHST Grade level Enrollment on 10/31 Number of students who registered for the SHSAT Number of students who took the SHSAT
0 05M046 P.S. 046 Arthur Tappan 2013 8 91 31 14
1 05M046 P.S. 046 Arthur Tappan 2014 8 95 26 7
2 05M046 P.S. 046 Arthur Tappan 2015 8 73 21 10
3 05M046 P.S. 046 Arthur Tappan 2016 8 56 20 8
4 05M123 P.S. 123 Mahalia Jackson 2013 8 62 31 14
In [31]:
shsat_frame['Took Percentage'] = shsat_frame['Number of students who took the SHSAT'] / shsat_frame['Number of students who registered for the SHSAT']
In [32]:
shsat_sorted = shsat_frame.groupby(['DBN','School name'], as_index=False).sum()
shsat_sorted.head()
Out[32]:
DBN School name Year of SHST Grade level Enrollment on 10/31 Number of students who registered for the SHSAT Number of students who took the SHSAT Took Percentage
0 05M046 P.S. 046 Arthur Tappan 8058 32 315 98 39 1.597034
1 05M123 P.S. 123 Mahalia Jackson 8058 32 241 84 37 1.700780
2 05M129 P.S. 129 John H. Finley 8058 32 252 108 48 2.396452
3 05M148 Eagle Academy for Young Men of Harlem 6047 25 222 12 11 1.857143
4 05M157 The Urban Assembly School for Global Commerce 8058 36 309 7 3 1.333333
In [33]:
shsat_sorted = shsat_frame.groupby(['DBN','School name'], as_index=False).sum()
shsat_sorted = shsat_sorted[['DBN','School name', 
                                 'Enrollment on 10/31', 
                                 'Number of students who registered for the SHSAT', 
                                 'Number of students who took the SHSAT']].join(shsat_frame[['Year of SHST', 
                                                                                          'Grade level']], how='inner')
shsat_sorted = shsat_sorted.sort_values('Number of students who registered for the SHSAT',ascending=False)
In [34]:
shsat_sorted.head()
Out[34]:
DBN School name Enrollment on 10/31 Number of students who registered for the SHSAT Number of students who took the SHSAT Year of SHST Grade level
22 84M336 KIPP Infinity Charter School 494 296 65 2016 8
29 84M726 KIPP STAR College Prep Charter School 292 292 96 2015 8
10 05M362 Columbia Secondary School 794 253 179 2015 8
20 84M065 Democracy Prep Endurance Charter School 567 194 94 2014 8
24 84M350 Democracy Prep Charter School 943 145 100 2014 8
In [35]:
shsat_sorted['Took Percentage'] = shsat_sorted['Number of students who took the SHSAT'] / shsat_sorted['Number of students who registered for the SHSAT']
In [36]:
sns.set(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(15, 10))

# Plot the total schools per city
sns.set_color_codes("pastel")
sns.barplot(x="Number of students who registered for the SHSAT", y='School name', data=shsat_sorted,
            label="# of SHSAT Registrations", color="b", ci=None)

# Plot the total community schools per city
sns.set_color_codes("muted")
sns.barplot(x="Number of students who took the SHSAT", y="School name", data=shsat_sorted,
            label="# of Students who Took SHSAT", color="b",ci=None)

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, 300), ylabel="School Name", title='SHSAT School Registration Distribution',
       xlabel="# of Registrations")
sns.despine(left=True, bottom=True)

plt.savefig('img/shsatregdist.png',bbox_inches='tight')
In [37]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 9))
sns.barplot(y='School name', x='Took Percentage', data=shsat_sorted.sort_values('Took Percentage', ascending = False), ax=ax, orient='h', ci=None, color='b')
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x224673569e8>
In [38]:
shsat_low_appearance_df = shsat_sorted[shsat_sorted['Took Percentage'] <= 0.4]
shsat_low_appearance_df.sort_values('Took Percentage')
Out[38]:
DBN School name Enrollment on 10/31 Number of students who registered for the SHSAT Number of students who took the SHSAT Year of SHST Grade level Took Percentage
22 84M336 KIPP Infinity Charter School 494 296 65 2016 8 0.219595
6 05M286 I.S. M286 Renaissance Leadership Academy 165 109 28 2015 8 0.256881
7 05M286 Urban Assembly Academy for Future Leaders 93 90 25 2016 8 0.277778
12 05M367 Academy for Social Action: A College Board School 254 14 4 2015 8 0.285714
29 84M726 KIPP STAR College Prep Charter School 292 292 96 2015 8 0.328767
9 05M304 Mott Hall High School 504 3 1 2014 8 0.333333
0 05M046 P.S. 046 Arthur Tappan 315 98 39 2013 8 0.397959
In [39]:
poor_schools_shsat_performance = list(np.unique(shsat_sorted['DBN']))
# poor_schools_shsat_performance = [i.strip().lower() for i in poor_schools_shsat_performance]

total_schools = list(set(school_frame['Location Code']))
# total_schools = [i.strip().lower() for i in total_schools]

intersection = list(set(poor_schools_shsat_performance).intersection(set(total_schools)))

print('Number of Intersecting schools = {}'.format(len(intersection)))
Number of Intersecting schools = 21
In [40]:
percent_black_hist = []
for school in intersection:
    percent_black_hist.append(list(school_frame[school_frame['Location Code'] == school]['Percent Black / Hispanic'])[0])
  
plt.figure(figsize=(10,5))
sns.distplot(percent_black_hist, kde=False)

temp = plt.title('Percent Black / Hispanic in low SHSAT-taking schools', fontsize=15)
temp = plt.xlabel("Percent Black / Hispanic", fontsize=15)
temp = plt.ylabel("Number of Schools", fontsize=15)


plt.savefig('img/percentbhshsat.png',bbox_inches='tight')
In [41]:
#As seen from the histogram above, the percentage of Black and Hispanic students is high where the percent of students taking the exam after registration is low.
In [42]:
#After School Programs 2.0 - School Performance Index
features_list = ['Rigorous Instruction %',
'Collaborative Teachers %',
'Supportive Environment %',
'Effective School Leadership %',
'Strong Family-Community Ties %',
'Trust %']

school_frame[['School Name'] + features_list ].head()
Out[42]:
School Name Rigorous Instruction % Collaborative Teachers % Supportive Environment % Effective School Leadership % Strong Family-Community Ties % Trust %
3 P.S. 034 FRANKLIN D. ROOSEVELT 85.0 78.0 82.0 73.0 89.0 88.0
8 P.S. 140 NATHAN STRAUS 94.0 91.0 85.0 87.0 83.0 93.0
10 P.S. 184M SHUANG WEN 90.0 81.0 91.0 67.0 83.0 85.0
11 P.S. 188 THE ISLAND SCHOOL 100.0 100.0 99.0 99.0 92.0 99.0
12 ORCHARD COLLEGIATE ACADEMY 72.0 77.0 77.0 72.0 76.0 87.0
In [43]:
#Looking at correlation of these features
school_frame[features_list].corr()
Out[43]:
Rigorous Instruction % Collaborative Teachers % Supportive Environment % Effective School Leadership % Strong Family-Community Ties % Trust %
Rigorous Instruction % 1.000000 0.757568 0.720514 0.643093 0.540730 0.708931
Collaborative Teachers % 0.757568 1.000000 0.594114 0.915435 0.485351 0.882959
Supportive Environment % 0.720514 0.594114 1.000000 0.488354 0.581007 0.714141
Effective School Leadership % 0.643093 0.915435 0.488354 1.000000 0.444356 0.853213
Strong Family-Community Ties % 0.540730 0.485351 0.581007 0.444356 1.000000 0.588877
Trust % 0.708931 0.882959 0.714141 0.853213 0.588877 1.000000
In [44]:
#From this we can see how correlated these features are to eachother

corr = school_frame[features_list].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, cmap='RdYlGn')
temp = plt.xticks(rotation=75, fontsize=15) 
temp = plt.yticks(fontsize=15) 

plt.savefig('img/schoolperformance.png',bbox_inches='tight')
In [45]:
#The highest correlation is between Effective School Leadership, Collaborative Teachers and, Trust 
#Principal Component Anaysis (PCA)' has been applied on the 3 features to get a single combined feature capturing the key characteristics of all those three features.

from sklearn.decomposition import PCA

correlated_features_list = ["Effective School Leadership %","Collaborative Teachers %","Trust %"]
corr_features_values = school_frame[correlated_features_list].values

pca = PCA(n_components=1)
combined_feature_value = pca.fit_transform(corr_features_values)
school_frame['PCA Combined Feature'] = combined_feature_value
#df_schools[correlated_features_list + ['PCA Combined Feature']].corr()
In [46]:
import sklearn

scaler = sklearn.preprocessing.MinMaxScaler()
scale_factor = 2*(school_frame['PCA Combined Feature'].corr(school_frame["Effective School Leadership %"])>0) -1 
school_frame['PCA Combined Feature'] =  scaler.fit_transform(scale_factor * school_frame['PCA Combined Feature'].values.reshape(-1,1))*100
In [47]:
print ("The correlation between the three correlated features and their PCA is shown below:")
school_frame[correlated_features_list + ['PCA Combined Feature']].corr()
The correlation between the three correlated features and their PCA is shown below:
Out[47]:
Effective School Leadership % Collaborative Teachers % Trust % PCA Combined Feature
Effective School Leadership % 1.000000 0.915435 0.853213 0.979714
Collaborative Teachers % 0.915435 1.000000 0.882959 0.970069
Trust % 0.853213 0.882959 1.000000 0.920314
PCA Combined Feature 0.979714 0.970069 0.920314 1.000000
In [48]:
#Weights to each factor for calculating School Performace Indicator:
#1.0 := Supportive Environment % 
#0.8 := Rigorous Instruction % 
#0.7 := PCA combined feature % 
#0.5 := Strong Family-Community Ties % 
#NOTE: PCA combined features incorporates these three features - Effective School Leadership %, Collaborative Teachers %, Trust %

features = ['Rigorous Instruction %','Supportive Environment %','PCA Combined Feature',
            'Strong Family-Community Ties %']
weights = [0.8, 1, 0.7, 0.5]

school_frame['SPI'] = school_frame[features].dot(weights)


print ("A glimpse of the School Performance Index (SPI) :")
school_frame[features+['SPI']].head(5)
A glimpse of the School Performance Index (SPI) :
Out[48]:
Rigorous Instruction % Supportive Environment % PCA Combined Feature Strong Family-Community Ties % SPI
3 85.0 82.0 78.859046 89.0 249.701332
8 94.0 85.0 90.408101 83.0 264.985671
10 90.0 91.0 76.608591 83.0 258.126014
11 100.0 99.0 100.000000 92.0 295.000000
12 72.0 77.0 77.852373 76.0 227.096661
In [49]:
#Separting schools into low and high performance groups

df_low_spi_schools = school_frame[school_frame['SPI'] < school_frame['SPI'].quantile(.25)]
df_high_spi_schools = school_frame[school_frame['SPI'] > school_frame['SPI'].quantile(.25)]
In [50]:
#fig = plot_city_hist(df_low_spi_schools, 'Distribution of low SPI schools by city')
#iplot(fig)

#We can see that there is high disparity in the ELA/Math proficiency amongst school students with high/low SPI.
In [51]:
print ("Average ELA / Math performance for the high SPI schools")
df_high_spi_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()
Average ELA / Math performance for the high SPI schools
Out[51]:
Average ELA Proficiency     2.592795
Average Math Proficiency    2.703537
dtype: float64
In [52]:
print ("Average ELA / Math performance for the low SPI schools")
df_low_spi_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()
Average ELA / Math performance for the low SPI schools
Out[52]:
Average ELA Proficiency     2.366839
Average Math Proficiency    2.370323
dtype: float64
In [53]:
import base64
from IPython.display import HTML

def download_link(df, filename = "data.csv"):
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    title = "Download CSV file"
    html = '<button type="button" style="font-size: larger;  background-color: #FFFFFF; border: 0pt;"><a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a></button>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)
In [54]:
dfn = df_low_spi_schools.drop_duplicates()
print(dfn.shape)
download_link(dfn, 'low_spi_schools.csv')
(155, 160)
In [55]:
dfn = df_all_schools.drop_duplicates()
print(dfn.shape)
download_link(dfn, 'top_performing_students.csv')
(355, 161)